full_data <- readRDS('data/full_data.rds')
prod_by_month <- full_data %>%
select(yearmonth, active_1km, monthly_oil_1km, monthly_gas_1km,
monthly_water_1km, monthly_boe_1km, active_2p5km, monthly_oil_2p5km,
monthly_gas_2p5km, monthly_water_2p5km, monthly_boe_2p5km, active_5km,
monthly_oil_5km, monthly_gas_5km, monthly_water_5km, monthly_boe_5km) %>%
distinct() %>%
filter(yearmonth <= '2023-03')
coef <- median(1032*prod_by_month$monthly_oil_1km/10^9)/median(prod_by_month$monthly_gas_1km/10^6)
full_data %>%
ggplot(aes(x = yearmonth, group = 1)) +
geom_line(aes(y = monthly_oil_1km/10^6, color = 'Oil'), size=1) +
geom_line(aes(y = 1032*monthly_gas_1km/10^9/coef, color = 'Natural Gas'), size=1) +
scale_y_continuous(
name = "Monthly Oil in Million Barrels",
sec.axis = sec_axis(~.*coef, name="Monthly Gas in Billion Cubic Feet")
) +
theme(
axis.title.y = element_text(color = 'tomato'),
axis.title.y.right = element_text(color = 'skyblue')
) +
scale_color_manual(values = c("skyblue", "tomato")) +
scale_x_discrete(breaks = prod_by_month$yearmonth[seq(1, length(prod_by_month$yearmonth), by = 7)]) +
xlab("Date") + labs(colour="Production Type") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 63301 rows containing missing values (`geom_line()`).
## Removed 63301 rows containing missing values (`geom_line()`).
H2S_dm <- full_data %>%
group_by(day, Monitor) %>%
summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 123 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1724: `day = 2020-08-22`, `Monitor = "Judson"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 122 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_ma <- H2S_dm %>%
mutate(yearmonth = format(day, "%Y-%m")) %>%
group_by(yearmonth, Monitor) %>%
summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
H2S_dm_graph <- H2S_dm %>%
ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Max H2S concentration by monitor", y = 'Daily Max H2S', x = 'time') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_dm_graph) %>% layout(dragmode = 'pan')
H2S_dm_graph_b <- H2S_dm %>%
ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Max H2S concentration by monitor w/o outlier", y = 'Daily Max H2S', x = 'time') +
scale_y_continuous(limits = c(0, 100)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
H2S_ma_graph <- H2S_ma %>%
ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Monthly Average H2S concentration by monitor", y = 'Monthly Average H2S', x = 'time') +
scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_ma_graph) %>% layout(dragmode = 'pan')
H2S_ma_graph_b <- H2S_ma %>%
ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Monthly Average H2S concentration by monitor w/o outlier", y = 'Monthly Average H2S', x = 'time') +
scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
scale_y_continuous(limits=c(0, 50)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_ma_graph_b) %>% layout(dragmode = 'pan')
The line with the largest deviation/peak corresponds to the 213th&Chico monitor, in 2021-10 Maybe we should start from January 2022 to start modelling to remove the extreme values, or simply removing the 213 monitor since that’s set up specifically for the event.
# Compute daily average
H2S_da <- full_data %>%
group_by(day, Monitor) %>%
summarise(H2S_daily_avg = mean(H2S, na.rm=TRUE))
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_da_graph <- H2S_da %>%
ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Average H2S concentration by monitor", y = 'Daily Average H2S', x = 'time') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_da_graph) %>% layout(dragmode = 'pan')
H2S_da_graph_b <- H2S_da %>%
ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
geom_line() +
labs(title = "Daily Average H2S concentration by monitor w/o outlier", y = 'Daily Avearge H2S', x = 'time') +
scale_y_continuous(limits = c(0, 100)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
full_data %>%
polarPlot(pollutant = "H2S", col = "turbo",
key.position = "bottom",
key.header = "mean H2S",
key.footer = NULL, title = 'hi')
# Create a polar map
# TBD: include markers for refineries
polarplot <- polarMap(
full_data %>%
filter(!(is.na(latitude.x) | is.na(H2S) | is.na(wd) | is.na(ws))) %>%
rename(date = DateTime),
pollutant = "H2S",
latitude = "latitude.x",
longitude = "longitude.x",
popup = "Monitor",
provider = "Esri.WorldImagery",
d.icon = 150,
d.fig = 2.5,
alpha = 0.75
)
##
Creating Polar Markers â– â– â– â– â– â– 15% | ETA: 10s
Creating Polar Markers â– â– â– â– â– â– â– â– 23%
## | ETA: 9s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– 31% | ETA: 7s
Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â– 38% | ETA: 7s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 46% | ETA:
## 5s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 54% | ETA: 5s
Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 62% | ETA: 4s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–
## 69% | ETA: 3s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 77% | ETA:
## 2s
Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 85% | ETA: 2s
Creating Polar
## Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 92% | ETA: 1s
polarplot
#saveWidget(polarplot, file="polarplot.html")
# Create variable to indicate downwind or upwind
full_data <- full_data %>%
mutate(downwind = abs(Converted_Angle - 180 - wd) <= 15)
# Compute daily average wd/ws
daily_full <- full_data %>%
select(H2S, ws, wd, latitude.x, longitude.x, Monitor, MinDist,
Converted_Angle, Refinery, year, month, day, weekday,
monthly_oil_1km, monthly_gas_1km, monthly_oil_2p5km,
monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km,
dist_wrp, capacity, active_1km, active_2p5km, active_5km) %>%
group_by(Monitor, day) %>%
mutate(H2S_daily_max = max(H2S, na.rm=TRUE),
H2S_daily_avg = mean(H2S, na.rm=TRUE),
ws_avg = mean(ws, na.rm=TRUE),
wd_avg = as.numeric(mean(circular(wd, units = 'degrees'), na.rm=TRUE))) %>%
mutate(wd_avg = if_else(wd_avg < 0, wd_avg+360, wd_avg)) %>%
mutate(daily_downwind = abs(Converted_Angle - 180 - wd_avg) <= 15) %>%
ungroup() %>%
rename(monitor_lat = latitude.x, monitor_lon = longitude.x) %>%
mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max)) %>%
select(-c(H2S, wd, ws)) %>%
distinct()
## Warning: There were 2449 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1637: `Monitor = "First Methodist"`, `day = 2022-11-23`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 2448 remaining warnings.
h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.172e+00 3.213e-03 364.72 <2e-16 ***
## year2023 -3.263e-01 2.800e-03 -116.54 <2e-16 ***
## wd_secQ2 -2.560e-01 3.751e-03 -68.23 <2e-16 ***
## wd_secQ3 -2.898e-01 3.892e-03 -74.46 <2e-16 ***
## wd_secQ4 -2.095e-01 3.492e-03 -59.99 <2e-16 ***
## ws -5.424e-02 4.176e-04 -129.91 <2e-16 ***
## I(1/MinDist^2) 2.114e+05 2.329e+03 90.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.996 8 2496 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0946 Deviance explained = 9.47%
## GCV = 0.92965 Scale est. = 0.92963 n = 772251
plot(h2s_model_a)
h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
s(latitude.x, longitude.x, bs='tp', k = 8),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) +
## s(latitude.x, longitude.x, bs = "tp", k = 8)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.012e+00 3.171e-03 319.16 <2e-16 ***
## wd_secQ2 -1.954e-01 3.696e-03 -52.87 <2e-16 ***
## wd_secQ3 -1.822e-01 3.905e-03 -46.65 <2e-16 ***
## wd_secQ4 -1.129e-01 3.487e-03 -32.39 <2e-16 ***
## ws -6.921e-02 4.147e-04 -166.90 <2e-16 ***
## I(1/MinDist^2) 3.002e+05 2.858e+03 105.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 1708 <2e-16 ***
## s(latitude.x,longitude.x) 6.999 7 6523 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.13 Deviance explained = 13%
## GCV = 0.8932 Scale est. = 0.89317 n = 772251
plot(h2s_model_b)
# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws +
I(1/MinDist^2) + Converted_Angle +
s(latitude.x, longitude.x, bs='tp', k = 10) +
as.factor(weekday),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2) + Converted_Angle + s(latitude.x, longitude.x,
## bs = "tp", k = 10) + as.factor(weekday)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.420e+00 2.374e-02 101.947 < 2e-16 ***
## year2023 -3.327e-01 2.864e-03 -116.135 < 2e-16 ***
## wd_secQ2 -1.858e-01 3.643e-03 -51.011 < 2e-16 ***
## wd_secQ3 -1.897e-01 3.846e-03 -49.325 < 2e-16 ***
## wd_secQ4 -1.090e-01 3.441e-03 -31.663 < 2e-16 ***
## ws -6.128e-02 4.076e-04 -150.319 < 2e-16 ***
## I(1/MinDist^2) 1.112e-04 1.961e-06 56.689 < 2e-16 ***
## Converted_Angle -5.775e-03 1.129e-04 -51.138 < 2e-16 ***
## as.factor(weekday).L 9.867e-02 2.801e-03 35.228 < 2e-16 ***
## as.factor(weekday).Q -1.699e-01 2.808e-03 -60.505 < 2e-16 ***
## as.factor(weekday).C -3.883e-03 2.803e-03 -1.385 0.166
## as.factor(weekday)^4 -2.054e-02 2.814e-03 -7.299 2.9e-13 ***
## as.factor(weekday)^5 -5.798e-02 2.807e-03 -20.654 < 2e-16 ***
## as.factor(weekday)^6 -2.507e-02 2.820e-03 -8.891 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.997 8 2708 <2e-16 ***
## s(latitude.x,longitude.x) 8.999 9 6588 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 30/31
## R-sq.(adj) = 0.156 Deviance explained = 15.6%
## GCV = 0.86621 Scale est. = 0.86618 n = 772251
plot(h2s_model_c)
# Include converted angle and weekday
h2s_model_d <- gam(H2S ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_sec + ws + downwind +
I(1/MinDist^2) + dist_wrp + capacity +
Converted_Angle +
s(latitude.x, longitude.x, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km
,
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind + I(1/MinDist^2) + dist_wrp + capacity +
## Converted_Angle + s(latitude.x, longitude.x, bs = "tp", k = 10) +
## monthly_oil_1km + monthly_gas_1km + active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.756e+00 5.797e-02 -47.539 < 2e-16 ***
## year2023 -1.351e-01 6.861e-03 -19.686 < 2e-16 ***
## as.factor(weekday).L 1.085e-01 2.987e-03 36.336 < 2e-16 ***
## as.factor(weekday).Q -1.832e-01 2.993e-03 -61.217 < 2e-16 ***
## as.factor(weekday).C -2.704e-04 2.979e-03 -0.091 0.928
## as.factor(weekday)^4 -1.483e-02 2.981e-03 -4.973 6.58e-07 ***
## as.factor(weekday)^5 -5.911e-02 2.973e-03 -19.881 < 2e-16 ***
## as.factor(weekday)^6 -2.863e-02 2.983e-03 -9.595 < 2e-16 ***
## wd_secQ2 -1.324e-01 4.002e-03 -33.095 < 2e-16 ***
## wd_secQ3 -1.039e-01 4.279e-03 -24.286 < 2e-16 ***
## wd_secQ4 -3.219e-02 3.895e-03 -8.265 < 2e-16 ***
## ws -7.034e-02 4.326e-04 -162.612 < 2e-16 ***
## downwindTRUE 2.411e-01 5.507e-03 43.776 < 2e-16 ***
## I(1/MinDist^2) 4.741e-05 9.877e-07 48.003 < 2e-16 ***
## dist_wrp 5.474e-04 6.056e-06 90.393 < 2e-16 ***
## capacity 4.510e-03 4.526e-05 99.644 < 2e-16 ***
## Converted_Angle -2.946e-03 1.230e-04 -23.948 < 2e-16 ***
## monthly_oil_1km 5.178e-05 2.487e-06 20.823 < 2e-16 ***
## monthly_gas_1km 2.548e-04 1.226e-05 20.794 < 2e-16 ***
## active_1km -2.875e-02 1.075e-03 -26.736 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.998 8.000 988.6 <2e-16 ***
## s(latitude.x,longitude.x) 8.003 8.004 5108.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 35/37
## R-sq.(adj) = 0.163 Deviance explained = 16.3%
## GCV = 0.89931 Scale est. = 0.89927 n = 711593
plot(h2s_model_d)
# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
I(1/MinDist^2),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg +
## ws_avg + I(1/MinDist^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.062e+00 3.468e-01 8.829 <2e-16 ***
## year2023 -4.367e-01 2.197e-01 -1.988 0.0469 *
## wd_avg 5.555e-04 1.031e-03 0.539 0.5903
## ws_avg -8.938e-02 6.342e-02 -1.409 0.1588
## I(1/MinDist^2) 7.943e-07 8.452e-08 9.399 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.171 8 2.002 7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 12/13
## R-sq.(adj) = 0.00868 Deviance explained = 1.06%
## GCV = 21.458 Scale est. = 21.408 n = 2664
plot(h2s_dm_model_a)
# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp",
## k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.085e+00 1.486e+01 0.612 0.54088
## wd_avg 2.922e-03 1.008e-03 2.900 0.00376 **
## ws_avg -3.126e-01 6.339e-02 -4.931 8.7e-07 ***
## I(1/MinDist^2) -1.612e-05 1.834e-03 -0.009 0.99299
## RefineryMarathon (Carson) -2.861e-01 1.840e+01 -0.016 0.98759
## RefineryMarathon (Wilmington) -3.058e+00 1.820e+01 -0.168 0.86659
## RefineryPhillips 66 (Wilmington) -1.378e+01 1.702e+01 -0.809 0.41840
## RefineryTorrance Refinery -2.545e+00 1.470e+01 -0.173 0.86261
## RefineryValero Refinery -6.947e+00 1.842e+01 -0.377 0.70604
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.026 8.000 1.698 0.000225 ***
## s(monitor_lat,monitor_lon) 5.408 5.699 9.292 6.79e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 25/26
## R-sq.(adj) = 0.127 Deviance explained = 13.1%
## GCV = 18.972 Scale est. = 18.862 n = 2664
plot(h2s_dm_model_b)
# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Converted_Angle+
s(monitor_lat, monitor_lon, bs='tp', k=10),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.630e+00 8.823e-01 4.114 4e-05 ***
## wd_avg 2.807e-03 1.012e-03 2.775 0.00555 **
## ws_avg -2.243e-01 6.210e-02 -3.612 0.00031 ***
## I(1/MinDist^2) -8.554e-06 3.810e-05 -0.225 0.82237
## Converted_Angle -3.393e-03 3.891e-03 -0.872 0.38337
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.103 8.000 2.015 5.44e-05 ***
## s(monitor_lat,monitor_lon) 7.070 7.918 41.050 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 21/22
## R-sq.(adj) = 0.116 Deviance explained = 12.1%
## GCV = 19.175 Scale est. = 19.08 n = 2664
plot(h2s_dm_model_c)
# Try to include as many variables as possible
h2s_dm_model_d <- gam(H2S_daily_max ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind +
I(1/MinDist^2) + dist_wrp + capacity +
Converted_Angle +
s(monitor_lat, monitor_lon, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km
,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind + I(1/MinDist^2) + dist_wrp +
## capacity + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.147e+01 4.574e+00 -2.508 0.012213 *
## year2023 -4.245e-01 4.427e-01 -0.959 0.337745
## as.factor(weekday).L 4.659e-01 2.417e-01 1.927 0.054043 .
## as.factor(weekday).Q -9.292e-01 2.421e-01 -3.837 0.000128 ***
## as.factor(weekday).C -1.894e-02 2.403e-01 -0.079 0.937178
## as.factor(weekday)^4 -1.241e-01 2.398e-01 -0.517 0.604921
## as.factor(weekday)^5 -3.749e-01 2.391e-01 -1.568 0.116965
## as.factor(weekday)^6 -1.976e-01 2.395e-01 -0.825 0.409537
## wd_avg 4.695e-03 1.166e-03 4.025 5.88e-05 ***
## ws_avg -3.350e-01 6.991e-02 -4.792 1.75e-06 ***
## daily_downwindTRUE 1.737e+00 5.757e-01 3.017 0.002578 **
## I(1/MinDist^2) 1.409e-04 3.917e-04 0.360 0.719113
## dist_wrp 2.149e-03 6.843e-04 3.141 0.001704 **
## capacity 8.174e-03 1.179e-02 0.694 0.488028
## Converted_Angle 6.316e-03 9.376e-03 0.674 0.500615
## monthly_oil_1km -3.758e-05 1.289e-04 -0.292 0.770662
## monthly_gas_1km -2.956e-04 5.914e-04 -0.500 0.617280
## active_1km 9.034e-03 4.265e-02 0.212 0.832249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 1.710 8.00 0.941 0.00534 **
## s(monitor_lat,monitor_lon) 7.761 7.97 12.285 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 34/35
## R-sq.(adj) = 0.139 Deviance explained = 14.8%
## GCV = 20.146 Scale est. = 19.927 n = 2428
plot(h2s_dm_model_d)
# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
Converted_Angle +
s(monitor_lat, monitor_lon, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind + I(1/MinDist^2) + I(1/dist_wrp^2) +
## capacity + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.219e+00 6.207e-01 -8.409 < 2e-16 ***
## year2023 -1.322e-01 6.003e-02 -2.203 0.02769 *
## as.factor(weekday).L 8.104e-02 2.530e-02 3.204 0.00138 **
## as.factor(weekday).Q -1.709e-01 2.531e-02 -6.751 1.84e-11 ***
## as.factor(weekday).C 2.172e-02 2.511e-02 0.865 0.38719
## as.factor(weekday)^4 -2.923e-03 2.505e-02 -0.117 0.90713
## as.factor(weekday)^5 -5.814e-02 2.498e-02 -2.327 0.02003 *
## as.factor(weekday)^6 -5.292e-02 2.502e-02 -2.115 0.03451 *
## wd_avg 1.093e-03 1.223e-04 8.939 < 2e-16 ***
## ws_avg -1.150e-01 7.546e-03 -15.245 < 2e-16 ***
## daily_downwindTRUE 5.224e-01 6.029e-02 8.665 < 2e-16 ***
## I(1/MinDist^2) 3.034e-03 2.794e-04 10.859 < 2e-16 ***
## I(1/dist_wrp^2) -3.284e-04 3.028e-05 -10.848 < 2e-16 ***
## capacity 1.638e-02 1.201e-03 13.633 < 2e-16 ***
## Converted_Angle -2.831e-03 1.042e-03 -2.716 0.00665 **
## monthly_oil_1km 4.364e-05 1.979e-05 2.205 0.02753 *
## monthly_gas_1km 1.702e-04 1.003e-04 1.696 0.09001 .
## active_1km -1.964e-02 8.525e-03 -2.304 0.02129 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.479 8 9.38 <2e-16 ***
## s(monitor_lat,monitor_lon) 8.987 9 68.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 33/35
## R-sq.(adj) = 0.403 Deviance explained = 41.1%
## GCV = 0.22009 Scale est. = 0.21715 n = 2428
plot(h2s_da_model_a)
h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year +
## wd_sec + ws + I(1/MinDist^2) + Refinery
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.299e+00 6.337e-01 3.628 0.000285 ***
## year2021 7.513e+00 3.587e-01 20.948 < 2e-16 ***
## year2022 -9.719e+00 3.990e-01 -24.359 < 2e-16 ***
## year2023 -4.519e+00 5.031e-01 -8.983 < 2e-16 ***
## wd_secQ2 -1.993e-01 3.750e-01 -0.531 0.595141
## wd_secQ3 3.380e+00 3.813e-01 8.865 < 2e-16 ***
## wd_secQ4 -1.999e+00 3.604e-01 -5.546 2.93e-08 ***
## ws 1.983e-01 4.050e-02 4.898 9.68e-07 ***
## I(1/MinDist^2) 5.375e+05 3.251e+05 1.653 0.098241 .
## RefineryMarathon (Carson) 4.189e+01 5.190e-01 80.710 < 2e-16 ***
## RefineryMarathon (Wilmington) 4.183e+00 6.036e-01 6.930 4.21e-12 ***
## RefineryPhillips 66 (Wilmington) 3.237e+00 5.332e-01 6.070 1.28e-09 ***
## RefineryTorrance Refinery -3.331e+00 4.910e-01 -6.785 1.16e-11 ***
## RefineryValero Refinery 6.077e+00 5.903e-01 10.294 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 4076 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0271 Deviance explained = 2.71%
## GCV = 24024 Scale est. = 24024 n = 2056378
plot(h2s_ma_model_a)
h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude.x, longitude.x, bs='tp', k=10), data = full_data)
summary(h2s_ma_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year +
## wd_sec + ws + I(1/MinDist^2) + Refinery + s(latitude.x, longitude.x,
## bs = "tp", k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 247.73043 1330.12474 0.186 0.852252
## year2021 4.46631 0.34810 12.830 < 2e-16 ***
## year2022 -5.16925 0.39625 -13.045 < 2e-16 ***
## year2023 7.01499 0.49759 14.098 < 2e-16 ***
## wd_secQ2 0.75576 0.34603 2.184 0.028956 *
## wd_secQ3 0.04189 0.35248 0.119 0.905406
## wd_secQ4 -1.22491 0.33302 -3.678 0.000235 ***
## ws -0.01374 0.03742 -0.367 0.713562
## I(1/MinDist^2) 1.80216 0.32232 5.591 2.26e-08 ***
## RefineryMarathon (Carson) -768.08881 1478.49393 -0.520 0.603407
## RefineryMarathon (Wilmington) 93.90759 1452.37996 0.065 0.948447
## RefineryPhillips 66 (Wilmington) 0.48764 1513.24530 0.000 0.999743
## RefineryTorrance Refinery -241.61130 1400.50381 -0.173 0.863031
## RefineryValero Refinery -388.51087 1432.72236 -0.271 0.786261
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 4850 <2e-16 ***
## s(latitude.x,longitude.x) 7.000 7 51324 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 30/31
## R-sq.(adj) = 0.172 Deviance explained = 17.2%
## GCV = 20447 Scale est. = 20446 n = 2056378
plot(h2s_ma_model_b)
daily_full <- daily_full %>%
mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
Monitor = str_replace_all(Monitor, ' ', '_'),
weekday = as.character(weekday))
train <- daily_full %>%
filter(Monitor != 'Harbor_Park')
test <- daily_full %>%
filter(Monitor == 'Harbor_Park')
# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
max_depth = c(3, 4, 5),
eta = c(0.1, 0.3),
gamma = c(0.01, 0.001),
colsample_bytree = c(0.5, 1),
min_child_weight = 0,
subsample = c(0.5, 0.75, 1))
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", number=10,verboseIter=TRUE, search='grid')
fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = fastDummies::dummy_cols(train[complete.cases(train),] %>%
# ungroup() %>%
# filter(day >= '2022-02-01') %>%
# select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
# monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max)) %>%
# mutate(MinDist = 1/(MinDist^2)),
# remove_selected_columns = TRUE)
# ,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
# Get testing performance on Harbor Park monitor
predictions <- predict(fit.xgb_da$finalModel,
newdata = fastDummies::dummy_cols(test[complete.cases(test),] %>%
ungroup() %>%
filter(day >= '2022-02-01') %>%
select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max, H2S_daily_avg)) %>%
mutate(MinDist = 1/(MinDist^2),
daily_downwindTRUE = if_else(daily_downwind, 1, 0)) %>%
select(-daily_downwind),
remove_selected_columns = TRUE) %>% as.matrix())
R2(test[complete.cases(test),] %>% filter(day >= '2022-02-01') %>% pull(H2S_daily_avg), predictions)
## [1] 0.2580564
fit.xgb_da$finalModel
## ##### xgb.Booster
## raw: 478 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## # of features: 33
## niter: 200
## nfeatures : 33
## xNames : monitor_lat monitor_lon MinDist Converted_Angle monthly_oil_1km monthly_gas_1km dist_wrp capacity active_1km ws_avg wd_avg daily_downwindTRUE year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 95 200 5 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top=10)
matrix <- fastDummies::dummy_cols(train[complete.cases(train),] %>%
ungroup() %>%
filter(day >= '2022-02-01') %>%
select(-c(Refinery, Monitor, day, active_2p5km, active_5km,
monthly_oil_2p5km, monthly_gas_2p5km, monthly_oil_5km,
monthly_gas_5km, H2S_daily_max, H2S_daily_avg)) %>%
mutate(MinDist = 1/(MinDist^2),
daily_downwindTRUE = if_else(daily_downwind, 1, 0)) %>%
select(-daily_downwind),
remove_selected_columns = TRUE)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 43
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in sqrt(sum.squares/one.delta): NaNs produced
xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da$finalModel, top_n = 10)